########################################################################
################    TEXT ANALYSIS VOX PRESS RELEASE   ##################
########################################################################
# BY: ALICE TIANBO ZHANG (alice.tianbo.zhang@gmail.com)
# UPDATED: 2021/10/20
#### LOAD PACKAGE ####
#packages <- c("tm", "plyr", "ggplot2", "wordcloud", "RColorBrewer", "SnowballC","tm.plugin.webmining","stm")
#install.packages(packages)
library(tm)
library(plyr)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(SnowballC)
library(tm.plugin.webmining)
library(stm)
#### SET DIRECTORY ####
# Clear user-created objects from the global environment. This does NOT detach
# packages or reset options, so it is not a substitute for a fresh R session.
rm(list = ls())
# Every relative path used below ("Data/...", "Results/...") is resolved
# against this project root.
# NOTE(review): machine-specific path; adjust before running elsewhere.
setwd("~/Dropbox/Research_Columbia/Renewables Voting (Urpelainen Zhang)/JOP/UZ_JOP2021_Replication")
# data.path <- "./Raw/Press/johannes_press"
# save.path <- "./Intermediate/4_vox/Press"
#### LOAD DATA ####
# Read the cleaned input CSV. Strings are kept as character (not factor) so
# the text column can be edited with gsub() afterwards. TRUE/FALSE are spelled
# out because T and F are ordinary, reassignable variables.
full_data <- read.csv(
  "Data/Final/vox_district_year_cleaned.csv",
  header = TRUE,
  sep = ",",
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
# Keep only the rows whose `text` value is not NA.
full_data <- full_data[complete.cases(full_data$text), ]
#### PRE-PROCESS DATA ####
# Remove HTML
# Two clean-up passes over the raw text, applied inside-out:
#   1. delete everything from a "<" up to the nearest following ">" (tags);
#   2. delete every occurrence of the entity prefix "&nbsp".
# NOTE(review): both passes substitute the empty string, so two words that
# were separated only by a tag or by "&nbsp" end up joined, and the ";" of a
# full "&nbsp;" entity is left behind. Confirm this is intended before
# changing it, because a change would alter the fitted topics.
full_data$text <- gsub(
  "&nbsp", "",
  gsub("<.*?>", "", full_data$text)
)
#write.csv(full_data, "Data/Final/vox_district_year_manualCheck.csv")
# Build the stm corpus from the cleaned text. textProcessor() removes
# stopwords, numbers and punctuation, lower-cases and stems the words; any
# remaining HTML is stripped as well (striphtml = TRUE). The full data frame
# travels along as document-level metadata.
processed <- textProcessor(
  documents = full_data$text,
  metadata = full_data,
  striphtml = TRUE
)
# Diagnostic plot: how many words/documents would be removed at each candidate
# threshold. seq(1, 200, by = 100) evaluates to just two thresholds, 1 and 101.
plotRemoved(processed[["documents"]], lower.thresh = seq(1, 200, by = 100))
# Remove infrequent words: with lower.thresh = 10 a word is kept only if it
# appears in more than 10 documents.
out <- prepDocuments(
  documents = processed[["documents"]],
  vocab = processed[["vocab"]],
  meta = processed[["meta"]],
  lower.thresh = 10
)
# Unpack the pieces the STM functions below take as separate arguments.
docs <- out[["documents"]] # word indices and associated counts
vocab <- out[["vocab"]] # words associated with the word indices
meta <- out[["meta"]] # document covariates
# Fit the main structural topic model: 20 topics, with topic prevalence
# modelled on party plus a spline of year. Spectral initialisation, capped at
# 75 EM iterations.
stm_prev1 <- stm(
  documents = docs,
  vocab = vocab,
  K = 20,
  prevalence = ~ party + s(year),
  data = meta,
  init.type = "Spectral",
  max.em.its = 75
)
## Model selection with selectModel
# Run 20 candidate fits of the same 20-topic specification from a fixed seed,
# then plot the retained candidates for comparison.
stm_selectM1 <- selectModel(
  documents = docs,
  vocab = vocab,
  K = 20,
  prevalence = ~ party + s(year),
  data = meta,
  max.em.its = 75,
  runs = 20,
  seed = 8458159
)
plotModels(stm_selectM1)
## Choice of the number of topics with searchK
# Fit the same prevalence specification for K = 10, 20, 30, 40 and 50 and plot
# the resulting diagnostics.
stm_searchK1 <- searchK(
  documents = docs,
  vocab = vocab,
  K = seq(10, 50, by = 10),
  prevalence = ~ party + s(year),
  data = meta
)
plot(stm_searchK1)
# Inspect the fitted topics. The plot() generic is used instead of calling the
# S3 method plot.STM() directly; it dispatches to the same method because
# stm_prev1 is the object returned by stm().
# Top 10 words for topics 1-10, wrapped at 150 characters per line.
plot(stm_prev1, type = "labels", n = 10, topics = 1:10, width = 150)
# Expected topic proportions with 5 words per topic; x-axis limited to 0-0.3.
plot(stm_prev1, type = "summary", n = 5, xlim = c(0, 0.3))
# Print the word lists that label each topic.
labelTopics(stm_prev1)
# Treat party as a categorical covariate before estimating its effect.
meta$party <- as.factor(meta$party)
# Regress the prevalence of each of the 20 topics on party and a spline of
# year, using the "Global" uncertainty setting. The `metadata` argument is
# spelled out in full rather than relying on partial matching of `meta =`.
prep1 <- estimateEffect(
  1:20 ~ party + s(year),
  stmobj = stm_prev1,
  metadata = meta,
  uncertainty = "Global"
)
# Preview the word cloud of every topic on the active graphics device.
# No device is opened here, so none is closed either: the previous dev.off()
# call (left over from a commented-out pdf() call) shut the device each cloud
# had just been drawn on. The PDF versions are written by the "Word Cloud"
# loop at the end of this file.
# NOTE(review): `width` is not a named argument of cloud(); it is forwarded
# through `...` - confirm it has the intended effect.
for (i in seq_len(20)) {
  cloud(stm_prev1, topic = i, width = 12)
}
# Plot, for each of the 20 topics, the estimated difference in topic
# prevalence between party "D" (cov.value1) and party "R" (cov.value2).
# Positive values therefore lie on the Democrat side of the axis. The plot()
# generic dispatches to the estimateEffect method for prep1.
plot(
  prep1,
  covariate = "party",
  topics = 1:20,
  model = stm_prev1,
  method = "difference",
  cov.value1 = "D",
  cov.value2 = "R",
  xlab = "Republican    <---->    Democrat",
  main = "Effect of Partisanship: Democrat vs. Republican",
  xlim = c(-0.15, 0.15)
)
#### OPTIONAL: PIN stm TO VERSION 1.1.3 (run manually, not part of the analysis) ####
# Package installation must not run in the middle of the analysis: by this
# point stm is already attached and stm_prev1 is already fitted. If the
# archived stm 1.1.3 release is needed, run ONE of the commands below by hand
# in a fresh R session, before library(stm), and then re-run this script.
#
# install.packages(
#   "https://cran.r-project.org/src/contrib/Archive/stm/stm_1.1.3.tar.gz",
#   repos = NULL, type = "source"
# )
# devtools::install_version("stm", version = "1.1.3", repos = "https://cran.r-project.org/")
#
# If the source build fails, check the local toolchain with:
# pkgbuild::check_build_tools(debug = TRUE)
# Word Cloud
# Write one PDF per topic to Results/Figures/stmPrev1_cloud_topic<i>.pdf.
# dev.off() runs in `finally`, so the PDF device is closed even when cloud()
# fails; the error itself still propagates and stops the script.
# NOTE(review): `width` is not a named argument of cloud(); it is forwarded
# through `...` - confirm it has the intended effect.
for (i in seq_len(20)) {
  pdf(paste0("Results/Figures/stmPrev1_cloud_topic", i, ".pdf"))
  tryCatch(
    cloud(stm_prev1, topic = i, width = 12),
    finally = dev.off()
  )
}
